(3) Optional: predict a few time steps ahead.
Summarize which model was chosen, the training and test procedure, the evaluation criteria, and the results.
#import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn import linear_model
from math import sqrt
from numpy import concatenate
from pandas import concat
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,RNN
from keras.layers import Dropout
from keras.models import load_model
import warnings
warnings.filterwarnings("ignore")
# Load the macro-economic dataset from the Excel workbook into `df`.
# NOTE(review): the path is absolute and machine-specific; it has to be
# adjusted before this runs on any other machine.
df = pd.read_excel('/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/USMacroData.xls')
# Preview the first rows and the (rows, columns) shape of the frame.
df.head()
df.shape
# Count the missing values in each column.
df.isnull().sum()
# Kernel-density plot of every column except the first one, drawn on a
# 7 x 3 grid of subplots inside one large figure.
col_name = df.columns
plt.rc("font", size=13)
plt.rcParams["figure.figsize"] = [30, 25]
alpha = 0.6  # not referenced by the plotting calls below
for position, col in enumerate(col_name[1:], start=1):
    plt.subplot(7, 3, position)
    column_values = df[str(col)]
    sns.kdeplot(column_values, shade=True, color="b")
# Summary statistics (count, mean, std, quartiles, ...) of the columns.
df.describe()
# Pairwise correlation matrix of the columns, rendered as a table whose cell
# background follows the 'coolwarm' colour map.
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')
# Time-series plot of each macro variable against 'Month', followed by one
# figure that overlays all of them. Every figure is shown and then written to
# the images/ directory.
import plotly.graph_objects as go
import datetime

# One entry per single-variable figure:
# (column, line colour, figure title, output image path)
_single_plots = [
    ('Inflation', 'red', "Historical inflation rate ", "images/inflation.png"),
    ('Wage', 'deepskyblue', "Historical Wage ", "images/wage.png"),
    ('Unemployment', 'green', "Historical Unemployment ", "images/Unemployment.png"),
    ('Consumption', 'blue', "Historical Consumption ", "images/Consumption.png"),
    ('Investment', 'orange', "Historical Investment ", "images/Investment.png"),
    ('InterestRate', 'magenta', "Historical InterestRate ", "images/InterestRate.png"),
]

for _column, _colour, _title, _image_path in _single_plots:
    fig = go.Figure([go.Scatter(x=df['Month'], y=df[_column], line_color=_colour)])
    fig.update_layout(title=_title, xaxis_title="Year", yaxis_title=_column)
    fig.show()
    fig.write_image(_image_path)

# Combined figure: one trace per variable, same colours as above.
fig = go.Figure()
for _column, _colour, _title, _image_path in _single_plots:
    fig.add_trace(
        go.Scatter(
            x=df['Month'],
            y=df[_column],
            line_color=_colour,
            name=_column,
            opacity=0.8,
        )
    )
# The x-axis range is set with date strings.
fig.update_layout(
    xaxis_range=['1965-01-01', '2016-01-01'],
    title_text="USA macroeconomy variables over time",
    xaxis_title="Year",
)
fig.show()
fig.write_image("images/economy_variables.png")
We use a sliding-window method to make one-step-ahead forecasts, one time step at a time.
We use the pandas shift function to create 12 lagged copies (time lags) of each column.
df.head()
df.shape
# Normalise the data; the scaled array is later turned into a supervised frame.
# Keep only the six model variables (the 'Month' column is left out).
df1 = df[['Inflation','Wage','Unemployment', 'Consumption','Investment', 'InterestRate']]
values = df1.values
print(values.shape)
# Normalise features: every column is scaled independently to the [0, 1]
# range given by `feature_range` below.
# NOTE(review): the earlier version of this comment argued for a [-1, 1]
# range because of the tanh activation, but the code uses (0, 1) - confirm
# which one is intended.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so the test rows influence the scaling - confirm that
# this is acceptable.
# MinMaxScaler needs a 2-D (matrix) input, so 1-D data has to be reshaped first.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of lagged columns.

    Every variable is shifted with pandas ``shift`` to produce its ``n_in``
    past values (inputs) and ``n_out`` current/future values (outputs).
    Adapted from www.machinelearningmastery.com.

    Args:
        data: Sequence of observations, one row per time step. May be a flat
            list, a nested list, a 1-D/2-D NumPy array or anything else the
            ``DataFrame`` constructor accepts.
        n_in: Number of lag observations used as input (X). Defaults to 1.
        n_out: Number of observations used as output (y). Defaults to 1.
        dropnan: Whether to drop rows that contain NaN values (the first
            ``n_in`` and last ``n_out - 1`` rows always do, because of the
            shifting). Defaults to True.

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)`` for the lags,
        ``var<j>(t)`` for the current step and ``var<j>(t+<i>)`` for later
        steps, ordered from the oldest lag to the furthest forecast step.
    """
    frame = DataFrame(data)
    # Take the variable count from the constructed frame so that nested lists
    # and 1-D inputs get a column-name list of the right length.
    n_vars = frame.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(frame.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(frame.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
# Build the supervised-learning frame from the scaled data.
n_past_months = 12 # number of lagged months used as model input
n_features = 6 # number of variables per month (the six columns selected into df1)
# One output step (t) per row, preceded by n_past_months lagged copies of
# every variable.
reframed = series_to_supervised(scaled, n_past_months, 1)
reframed.head()
print(reframed.shape)
Features: the number of feature columns used for prediction (6 in our case).
We use 46 years of data for training and 5 years (60 months) of data for testing.
# Split the supervised frame into train and test sets by position: the rows
# stay in their original order and the last n_test_months rows form the test set.
# NOTE: `values` is rebound here to the supervised array; it no longer holds
# the raw df1 values.
values = reframed.values
# Number of trailing rows (months) held out for testing.
n_test_months = 60
train = values[:-n_test_months, :]
test =values[-n_test_months:,:]
target_feature: 6 -> Inflation, 5 -> Wage, 4 -> Unemployment, 3 -> Consumption, 2 -> Investment, 1 -> InterestRate
We create 6 models, one to predict each of the 6 variables.
def model_training(target_feature, model_name):
    """Train a single-output LSTM for one variable, save it and plot its loss.

    Reads the module-level ``train``, ``test``, ``n_past_months`` and
    ``n_features``. The prepared arrays are published as the globals
    ``train_X``, ``train_y``, ``test_X`` and ``test_y``.

    Args:
        target_feature: Position of the target column counted from the right
            edge of the supervised table (column ``-target_feature``).
        model_name: Base name of the saved "<model_name>.h5" model file and of
            the "<model_name>_error.png" loss plot.
    """
    global train_X, train_y, test_X, test_y

    models_dir = "/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/"
    images_dir = "/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/images/"

    # The first n_past_months * n_features columns hold the lagged inputs.
    lag_width = n_past_months * n_features
    window_shape = (n_past_months, n_features)

    train_y = train[:, -target_feature]
    test_y = test[:, -target_feature]
    # Inputs are reshaped to 3D: [samples, timesteps, features].
    train_X = train[:, :lag_width].reshape((train.shape[0],) + window_shape)
    test_X = test[:, :lag_width].reshape((test.shape[0],) + window_shape)

    # LSTM(50) -> Dropout(0.2) -> Dense(1), trained on mean absolute error.
    model = Sequential([
        LSTM(50, input_shape=window_shape),
        Dropout(0.2),
        Dense(1),
    ])
    model.compile(loss='mae', optimizer='adam')

    # The test split is passed as validation data, so its loss is recorded
    # for every epoch alongside the training loss.
    history = model.fit(
        train_X,
        train_y,
        epochs=100,
        validation_data=(test_X, test_y),
        verbose=2,
        shuffle=False,
    )

    # Model and architecture go into a single h5 file.
    model.save(models_dir + model_name + ".h5")

    # Loss curves per epoch.
    for curve, label in (('loss', 'train'), ('val_loss', 'test')):
        plt.plot(history.history[curve], label=label)
    plt.legend()
    plt.xlabel('epochs')
    plt.ylabel('mae')
    plt.savefig(images_dir + model_name + "_error.png")
    plt.show()
def model_prediction(target):
    """Evaluate the saved model of one variable on the test months.

    Loads "model_<target>.h5", predicts every test row from its lagged
    inputs, converts predictions and actual values back to the original
    units, prints the test RMSE and saves/shows an actual-vs-predicted plot.

    The function reads the globals ``test_X`` and ``test_y``, which are set by
    ``model_training``; it must therefore be called right after
    ``model_training`` was run for the same variable, otherwise ``test_y``
    belongs to a different variable.

    Args:
        target: Column name of the variable (one of ``df1.columns``); it also
            selects the model file and names the output plot.

    Raises:
        ValueError: If ``target`` is not one of the scaled columns.
    """
    global test_X
    global test_y

    # The scaler was fitted on df1's columns, so the target's position there
    # selects the min/max that must be used to undo the scaling.
    feature_names = list(df1.columns)
    if target not in feature_names:
        raise ValueError("unknown target %r; expected one of %s" % (target, feature_names))
    target_col = feature_names.index(target)

    def _invert_scaling(scaled_column):
        # inverse_transform expects all n_features columns; only the target
        # column matters, the zero padding in the others is discarded.
        padded = np.zeros((len(scaled_column), n_features))
        padded[:, target_col] = np.ravel(scaled_column)
        return scaler.inverse_transform(padded)[:, target_col]

    # load model
    model = load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_"+target+".h5")

    # make a prediction; the model expects [samples, timesteps, features]
    test_X = test_X.reshape((test_X.shape[0], n_past_months, n_features))
    yhat = model.predict(test_X)
    # As before, the globals are left flattened / as a column vector so code
    # that reads them after this call sees the same shapes.
    test_X = test_X.reshape((test_X.shape[0], n_past_months*n_features))
    test_y = test_y.reshape((len(test_y), 1))

    # invert scaling for forecast and actual values, in the target's own column
    inv_yhat = _invert_scaling(yhat)
    inv_y = _invert_scaling(test_y)

    # calculate RMSE in the original units of the target
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    print('Test RMSE: %.3f' % rmse)

    # Visualising the results
    plt.plot(inv_y, color = 'red', label = 'actual value')
    plt.plot(inv_yhat, color = 'blue', label = 'Predicted value')
    plt.title(target+' economy variable Prediction')
    plt.xlabel('Time(months)')
    plt.ylabel(target)
    plt.legend()
    plt.savefig("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/images/"+target+"_prediction.png")
    plt.show()
# Train and evaluate one model per variable. Each model_prediction call reads
# the test_X / test_y globals set by the model_training call directly above
# it, so the two calls must stay paired and in this order.
# target_feature counts the target column from the right edge of the
# supervised table: 6 is the first variable (Inflation), 1 the last (InterestRate).
model_training(6, 'model_Inflation')
model_prediction('Inflation')
df.columns
model_training(5, 'model_Wage')
model_prediction('Wage')
model_training(4, 'model_Unemployment')
model_prediction('Unemployment')
model_training(3, 'model_Consumption')
model_prediction('Consumption')
model_training(2, 'model_Investment')
model_prediction('Investment')
model_training(1, 'model_InterestRate')
model_prediction('InterestRate')
# Load the six saved models (one per variable) back from their h5 files.
# NOTE(review): the paths are absolute and machine-specific, and the files
# must have been written by model_training beforehand.
model_Inflation=load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_Inflation.h5")
model_Wage=load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_Wage.h5")
model_Unemployment=load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_Unemployment.h5")
model_Consumption=load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_Consumption.h5")
model_Investment=load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_Investment.h5")
model_InterestRate=load_model("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/models/model_InterestRate.h5")
# Recursive multi-step forecast: predict all six variables for the month after
# the last observation, append that prediction to the input window, drop the
# oldest month and repeat. Only the Inflation forecast is plotted.

# Models in the same column order the scaler was fitted on, so that the
# stacked predictions form one complete (scaled) observation row.
model_list = [model_Inflation, model_Wage, model_Unemployment,
              model_Consumption, model_Investment, model_InterestRate]
n_future_months = 6    # number of months to forecast beyond the data
n_history_months = 60  # number of observed months shown in the plot

# Seed window: the last n_past_months scaled observations, including the most
# recent one. (The last test row's inputs stop one month earlier, so using
# them would re-predict the final observed month instead of the next month.)
tmp = scaled[-n_past_months:].reshape(1, n_past_months, n_features)

future_scaled = []
for _step in range(n_future_months):
    # Every model returns one value for the window; stack them into one row.
    ypred = np.concatenate([model.predict(tmp) for model in model_list])
    ypred = ypred.reshape(1, n_features)
    future_scaled.append(ypred[0])
    # Slide the window: drop the oldest month, append the predicted month.
    tmp = concatenate((tmp[:, 1:, :], ypred.reshape(1, 1, n_features)), axis=1)

# Invert the scaling of the forecast rows only; column 0 is Inflation.
inv_future = scaler.inverse_transform(np.array(future_scaled))
history = df.Inflation[-n_history_months:].values
tt = concatenate((history, inv_future[:, 0]), axis=0)

# Plot the observed months, then the forecast as a dashed continuation that
# starts at the last observed point.
forecast_start = len(history) - 1
plt.plot(history, color = 'b', label = 'actual value')
plt.plot(range(forecast_start, len(tt)), tt[forecast_start:], color = 'b',
         label = 'Predicted value', linestyle='dashed')
plt.title('Inflation economy variable Prediction')
plt.xlabel('Time ( months)')
plt.ylabel('Inflation')
plt.legend()
plt.savefig("/Users/akshay/Downloads/ADP_challange/ADP_lead_datascientist/images/future_month_prediction.png")
plt.show()